Note: Using vst normalisation (just for performing the DEA from the limma package) and filtering out genes with a raw expression sum below 40

# Load csvs
datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')

# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
if(!all(colnames(datExpr) == rownames(datMeta))){
  print('Columns in datExpr don\'t match the rowd in datMeta!')
}

# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position

# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))

#################################################################################
# FILTERS:

# 1 Filter probes with start or end position missing (filter 5)
# These can be filtered without probe info, they have weird IDS that don't start with ENS
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id

# 2. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

# 3. Filter samples with rowSums <= 40
to_keep = rowSums(datExpr)>40
datExpr = datExpr[to_keep,]
datProbes = datProbes[to_keep,]

#################################################################################
# Annotate SFARI genes

# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19

gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'), 
                   values=SFARI_genes$`gene-symbol`, mart=mart) %>% 
                   mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>% 
                   dplyr::select('ID', 'gene-symbol')

SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')

#################################################################################
# Add functional annotation to genes from GO

GO_annotations = read.csv('./../working_data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID' = as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal' = 1)


datExpr_backup = datExpr

rm(getinfo, to_keep, gene_names, mart, GO_annotations)

Number of genes:

nrow(datExpr)
## [1] 33478
################################################################################
# Calculate Differential Expression using limma and DESeq2 packages

# Limma
if(!file.exists('./../working_data/genes_ASD_DE_info_limma.csv')){
  
  # First perform vst normalisation
  counts = as.matrix(datExpr)
  rowRanges = GRanges(datProbes$chromosome_name,
                      IRanges(datProbes$start_position, width=datProbes$length),
                      strand=datProbes$strand,
                      feature_id=datProbes$ensembl_gene_id)
  
  se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
  dds = DESeqDataSet(se, design =~Diagnosis_)
  dds = estimateSizeFactors(dds)
  vst_output = vst(dds)
  datExpr_vst = assay(vst_output)
  
  # Calculate DE of normalised data
  mod = model.matrix(~ Diagnosis_, data=datMeta)
  corfit = duplicateCorrelation(datExpr_vst, mod, block=datMeta$Subject_ID)
  lmfit = lmFit(datExpr_vst, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
  fit = eBayes(lmfit, trend=T, robust=T)
  top_genes = topTable(fit, coef=2, number=nrow(datExpr_vst))
  DE_info_limma = top_genes[match(rownames(datExpr_vst), rownames(top_genes)),] %>%
                  rownames_to_column(var = 'ID') %>%
                  mutate('logFC_limma'=logFC, 'adj.P.Val_limma'=adj.P.Val) %>%
                  dplyr::select(ID, logFC_limma, adj.P.Val_limma)
  
  write.csv(DE_info_limma, './../working_data/genes_ASD_DE_info_limma.csv', row.names = FALSE)
  
  rm(mod, corfit, lmfit, fit, top_genes)
  
} else DE_info_limma = read.csv('./../working_data/genes_ASD_DE_info_limma.csv')

# DESeq2 DE
if(!file.exists('./../working_data/genes_ASD_DE_info_DESeq2.csv')){
  counts = as.matrix(datExpr)
  rowRanges = GRanges(datProbes$chromosome_name,
                      IRanges(datProbes$start_position, width=datProbes$length),
                      strand=datProbes$strand,
                      feature_id=datProbes$ensembl_gene_id)
  
  se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
  ddsSE = DESeqDataSet(se, design =~Diagnosis_)
  
  dds = DESeq(ddsSE)
  DE_info_DESeq2 = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
                   mutate('logFC_DESeq2'=log2FoldChange, 'adj.P.Val_DESeq2'=padj) %>% 
                   dplyr::select(ID, logFC_DESeq2, adj.P.Val_DESeq2)
  
  write.csv(DE_info_DESeq2, './../working_data/genes_ASD_DE_info_DESeq2.csv', row.names = FALSE)
  
  rm(counts, rowRanges, se, ddsSE, dds)
  
} else DE_info_DESeq2 = read.csv('./../working_data/genes_ASD_DE_info_DESeq2.csv')

DE_info = DE_info_limma %>% left_join(DE_info_DESeq2, by='ID')

rm(DE_info_limma, DE_info_DESeq2)

DE Output Comparisons


Log fold change

It seems like there are two main lines of points

ggplotly(DE_info %>% ggplot(aes(logFC_DESeq2, logFC_limma)) + geom_point(alpha=0.05, aes(id=ID)) +
         geom_vline(xintercept=-log2(1.2), color='#cc0099', size=0.5) + 
         geom_hline(yintercept=-log2(1.2), color='#cc0099', size=0.5) +
         geom_vline(xintercept= log2(1.2), color='#cc0099', size=0.5) + 
         geom_hline(yintercept= log2(1.2), color='#cc0099', size=0.5) +
         geom_smooth(method=lm, se=F, color='#009999', size=0.5) + theme_minimal() + coord_fixed() +
         ggtitle(glue('LogFC (corr=',round(cor(DE_info$logFC_limma, DE_info$logFC_DESeq2),2),')')))

P-values

Although both functions use the BH method to adjust for multiple testing, the limma package seems to be more strict, assigning a high p-value to many of the points that DESeq2 considers statistically significant

DE_info %>% ggplot(aes(adj.P.Val_DESeq2, adj.P.Val_limma)) + geom_point(alpha=0.05, aes(id=ID)) + 
            geom_vline(xintercept=0.05, color='#cc0099') + geom_hline(yintercept=0.05, color='#cc0099') +
            geom_smooth(method=lm, se=F, color='#009999', size=0.5) + theme_minimal() + coord_fixed() +
            ggtitle(glue('Adjusted p-values (corr=',round(cor(DE_info$adj.P.Val_limma, 
                                                                DE_info$adj.P.Val_DESeq2),2),')'))


Statistical significance concordance using adjusted p-values<0.05 and logFC>log2(1.2):
signif_genes = data.frame('limma Significance' = DE_info$adj.P.Val_limma<0.05 & DE_info$logFC_limma>log2(1.2),
                          'DESeq2 Significance' = DE_info$adj.P.Val_DESeq2<0.05 & DE_info$logFC_DESeq2>log2(1.2),
                          'ID' = DE_info$ID)

signif_genes %>% dplyr::select(-ID) %>% table
##                   DESeq2.Significance
## limma.Significance FALSE  TRUE
##              FALSE 31568   481
##              TRUE    184  1245
signif_genes %>% dplyr::select(-ID) %>% table/nrow(DE_info)*100
##                   DESeq2.Significance
## limma.Significance      FALSE       TRUE
##              FALSE 94.2947607  1.4367644
##              TRUE   0.5496147  3.7188601
print(glue(round(sum(signif_genes$limma.Significance & !signif_genes$DESeq2.Significance)/sum(signif_genes$limma.Significance)*100,2),
           '% of the statistically significant limma genes are not significant for DESeq2.
           ', round(sum(!signif_genes$limma.Significance & signif_genes$DESeq2.Significance)/sum(signif_genes$DESeq2.Significance)*100,2),
           '% of the statistically significant DESeq2 genes are not significant for limma.'))
## 12.88% of the statistically significant limma genes are not significant for DESeq2.
## 27.87% of the statistically significant DESeq2 genes are not significant for limma.


SFARI score distribution of genes classified as significant by both methods

both_methods = data.frame('ID'=signif_genes %>% filter(limma.Significance & DESeq2.Significance) %>% dplyr::select(ID)) %>%      
               left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)

table(both_methods$`gene-score`, useNA='ifany')
## 
##    1    2    3    4    5    6 <NA> 
##    2    3   18   47   27    2 1146
round(table(both_methods$`gene-score`, useNA='ifany')/nrow(both_methods)*100,2)
## 
##     1     2     3     4     5     6  <NA> 
##  0.16  0.24  1.45  3.78  2.17  0.16 92.05
rm(both_methods)


SFARI score distribution of genes classified as significant by limma but not DESeq2

only_limma = data.frame('ID'=signif_genes %>% filter(limma.Significance & !DESeq2.Significance) %>% dplyr::select(ID)) %>%      
             left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)

table(only_limma$`gene-score`, useNA='ifany')
## 
##    2    3    4    5    6 <NA> 
##    2    3    3    5    1  170
round(table(only_limma$`gene-score`, useNA='ifany')/nrow(only_limma)*100,2)
## 
##     2     3     4     5     6  <NA> 
##  1.09  1.63  1.63  2.72  0.54 92.39
rm(only_limma)


SFARI score distribution of genes classified as significant by DESeq2 but not limma

only_DESeq2 = data.frame('ID'=signif_genes %>% filter(!limma.Significance & DESeq2.Significance) %>% dplyr::select(ID)) %>%      
             left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)

table(only_DESeq2$`gene-score`, useNA='ifany')
## 
##    2    3    4    5 <NA> 
##    2    2    6    3  468
round(table(only_DESeq2$`gene-score`, useNA='ifany')/nrow(only_DESeq2)*100,2)
## 
##     2     3     4     5  <NA> 
##  0.42  0.42  1.25  0.62 97.30
rm(only_DESeq2)


SFARI score distribution of genes classified as not significant by both methods

neither = data.frame('ID'=signif_genes %>% filter(!limma.Significance & !DESeq2.Significance) %>% dplyr::select(ID)) %>%      
             left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)

table(neither$`gene-score`, useNA='ifany')
## 
##     1     2     3     4     5     6  <NA> 
##    22    53   165   377   132    21 30799
round(table(neither$`gene-score`, useNA='ifany')/nrow(neither)*100,2)
## 
##     1     2     3     4     5     6  <NA> 
##  0.07  0.17  0.52  1.19  0.42  0.07 97.56
rm(neither, signif_genes)